In [1]:
import numpy as np
import os
import json
import glob
from itertools import count
from scipy import sparse
In [2]:
def get_all_files(basedir, ext='.json'):
    '''
    Arguments
        basedir: path to the dataset
        ext: file extension (default is .json)
    Returns
        files: ndarray containing the paths of all files with extension ext under basedir
        filenames: dict mapping each filename (without extension) to the directory containing it
    '''
    filenames = {}
    files = np.array([])
    for root, _, fnames in os.walk(basedir):
        files = np.concatenate((files, glob.glob(os.path.join(root, '*' + ext))))
        for fn in fnames:
            filenames[os.path.splitext(fn)[0]] = root
    return files, filenames
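A quick sanity check for get_all_files: the hypothetical cell below (temporary directory and track-id file names are made up for illustration, not part of the dataset) writes two empty JSON files and lists them.
In [ ]:
# Hypothetical sanity check: list two toy .json files from a temporary directory.
import tempfile
tmp = tempfile.mkdtemp()
for name in ['TRAAAAA', 'TRBBBBB']:
    with open(os.path.join(tmp, name + '.json'), 'w') as f:
        json.dump({}, f)
toy_files, toy_filenames = get_all_files(tmp)
print(toy_files)      # the two .json paths under tmp
print(toy_filenames)  # {'TRAAAAA': tmp, 'TRBBBBB': tmp}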
In [3]:
def get_stochastic_matrix(files, t=0):
    '''
    Construct the similarity (column-)stochastic matrix in COOrdinate format
    Arguments
        files: iterable of paths to the song JSON files
        t: threshold on the similarity strength
    Returns
        M: column-stochastic matrix
        song_to_row: dict mapping a song's track id to its row index in the matrix
        song_to_col: dict mapping a song's track id to its column index in the matrix
    Note: since M is square, each song's row index equals its column index
    '''
    row = []
    col = []
    data = []
    song_to_row = {}
    song_to_col = {}
    row_count = count()
    col_count = count()
    for file in files:
        with open(file, 'r') as f:
            song = json.load(f)
        tid = song['track_id']
        if tid not in song_to_row:
            song_to_row[tid] = next(row_count)
        if tid not in song_to_col:
            song_to_col[tid] = next(col_count)
        out_degree = len(song['similars'])
        for s in song['similars']:
            sid = s[0]
            if sid not in song_to_col:
                song_to_col[sid] = next(col_count)
            if sid not in song_to_row:
                song_to_row[sid] = next(row_count)
            if s[1] > t:  # add the edge only if the similarity score is strictly greater than threshold t
                row.append(song_to_row[tid])
                col.append(song_to_col[sid])
                data.append(1.0 / out_degree)  # out_degree > 0 whenever this loop body runs
    M = sparse.coo_matrix((data, (row, col)), shape=(len(song_to_row), len(song_to_col)))  # ensure the matrix is square
    M = M.T  # transpose so that column j holds the outgoing edges of song j
    del row, col, data, row_count, col_count
    return M, song_to_row, song_to_col
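To see the construction in isolation, here is a hypothetical three-song graph (track ids A, B, C are made up) written to a temporary directory. With t=0 and all toy scores positive, every edge is kept, so after the transpose each non-dead-end column sums to 1; song C has no outgoing edges and its column is all zeros.
In [ ]:
# Hypothetical toy graph: A -> B, A -> C, B -> A, C has no outgoing edges.
import tempfile
tmp = tempfile.mkdtemp()
toy_songs = [
    {'track_id': 'A', 'similars': [['B', 0.5], ['C', 0.5]]},
    {'track_id': 'B', 'similars': [['A', 0.5]]},
    {'track_id': 'C', 'similars': []},
]
toy_paths = []
for song in toy_songs:
    path = os.path.join(tmp, song['track_id'] + '.json')
    with open(path, 'w') as f:
        json.dump(song, f)
    toy_paths.append(path)
toy_M, toy_rows, _ = get_stochastic_matrix(toy_paths, t=0)
print(toy_rows)                     # {'A': 0, 'B': 1, 'C': 2}
print(toy_M.toarray().sum(axis=0))  # [1. 1. 0.] -- column C is a dead end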
In [4]:
def get_songs_to_tags_matrix(files, song_to_row, g=50):
    '''
    Construct the songs-to-tags matrix in COOrdinate format
    Arguments
        files: iterable of paths to the song JSON files
        song_to_row: dict mapping track ids to row indices (shared with the stochastic matrix M)
        g: threshold on the tag strength
    Returns
        N: songs-to-tags matrix
        tags_to_col: dict mapping each tag to its column index in N
    '''
    row = []
    col = []
    data = []
    tags_to_col = {}
    col_count = count()
    for file in files:
        with open(file, 'r') as f:
            song = json.load(f)
        tid = song['track_id']
        for t in song['tags']:
            tag = t[0]
            cnt = t[1]
            if tag not in tags_to_col:
                tags_to_col[tag] = next(col_count)
            if int(cnt) >= g:
                row.append(song_to_row[tid])
                col.append(tags_to_col[tag])
                data.append(1)
    N = sparse.coo_matrix((data, (row, col)), shape=(len(song_to_row), len(tags_to_col)))  # one row per song, even for songs with no strong tags
    del row, col, data, col_count
    return N, tags_to_col
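Continuing the toy example, the hypothetical cell below tags the same three songs (tag counts appear as strings, which is why the function casts with int(cnt)). Only tags with count at or above g produce an entry of N.
In [ ]:
# Hypothetical tags: only entries with count >= g end up in N.
import tempfile
tmp = tempfile.mkdtemp()
toy_tagged = [
    {'track_id': 'A', 'tags': [['rock', '100'], ['pop', '10']]},
    {'track_id': 'B', 'tags': [['rock', '60']]},
    {'track_id': 'C', 'tags': [['jazz', '90']]},
]
toy_tag_paths = []
for song in toy_tagged:
    path = os.path.join(tmp, song['track_id'] + '.json')
    with open(path, 'w') as f:
        json.dump(song, f)
    toy_tag_paths.append(path)
toy_N, toy_tags = get_songs_to_tags_matrix(toy_tag_paths, toy_rows, g=50)
print(toy_tags)         # {'rock': 0, 'pop': 1, 'jazz': 2}
print(toy_N.toarray())  # rows A and B have a 1 in the 'rock' column; 'pop' falls below g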
In [5]:
def get_teleport_set(N, tags, user_tags):
    '''
    Arguments
        N: songs-to-tags matrix
        tags: dict mapping tags to column indices of N
        user_tags: list of tags specified by the user
    Returns
        teleport_set: ndarray with the row indices of all songs carrying every tag in user_tags
    '''
    tag_ids = [tags[t] for t in user_tags]  # map the user-specified tags to their column indices
    teleport_set = N.getcol(tag_ids[0]).nonzero()[0]
    for tag_id in tag_ids[1:]:
        teleport_set = np.intersect1d(teleport_set, N.getcol(tag_id).nonzero()[0])
    return teleport_set
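With the toy N from above, asking for the 'rock' tag should return the row indices of songs A and B.
In [ ]:
# Hypothetical check: both A (row 0) and B (row 1) carry the 'rock' tag.
print(get_teleport_set(toy_N.tocsc(), toy_tags, ['rock']))  # [0 1]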
In [6]:
def compute_topic_specific_page_rank(M, teleport_set, iterations=50, beta=0.8):
    '''
    Arguments
        M: stochastic column matrix
        teleport_set: row indices of all songs that match the user's topics
        iterations: number of power iterations for PageRank
        beta: 1 - teleport probability
    Returns
        r: ndarray with the stationary probability distribution
    '''
    e = np.zeros((M.shape[0], 1))
    e[teleport_set, :] = 1.0 / teleport_set.shape[0]  # uniform distribution over the teleport set
    r = np.copy(e)  # initialize with the teleport distribution for faster convergence
    for _ in range(iterations):
        r = beta * M * r + (1 - beta) * e
    del e
    return r
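Running the iteration on the toy M with teleportation to A and B concentrates the mass on those two songs. One caveat visible here: the update r = beta*M*r + (1-beta)*e does not re-inject mass lost through all-zero columns, so the dead-end song C leaks probability and r no longer sums to 1.
In [ ]:
# Hypothetical run on the toy graph: teleport only to songs A (row 0) and B (row 1).
toy_r = compute_topic_specific_page_rank(toy_M.tocsr(), np.array([0, 1]))
print(toy_r.ravel())  # most mass on A and B
print(toy_r.sum())    # < 1: the dead-end column of song C leaks probability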
In [7]:
def get_top_songs(r, tid_to_ids, n=5):
    '''
    Arguments
        r: ndarray with the stationary probability distribution
        tid_to_ids: dict mapping track ids to the row indices of the stochastic matrix
        n: number of songs to return
    Returns
        tracks: list of the n highest-ranked track ids, best first
    '''
    tracks = []
    top_k = np.argsort(r, axis=0)[-n:][::-1]  # indices of the n largest entries of r, in descending order
    ids_to_tid = {v: k for k, v in tid_to_ids.items()}  # invert the mapping: row index -> track id
    for i in range(top_k.shape[0]):
        tracks.append(ids_to_tid[top_k.item(i)])
    return tracks
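Closing the toy example, the two highest-ranked track ids come back best first.
In [ ]:
# Hypothetical check: map the toy ranking back to track ids.
print(get_top_songs(toy_r, toy_rows, n=2))  # e.g. ['A', 'B']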
In [8]:
'''
Parameters of our model
'''
basedir = os.path.abspath('lastfm_subset')
user_tags = ['rock']  # tags defining the user's topic
t = 0                 # similarity threshold (edges with score > t are kept)
g = 50                # tag-strength threshold (tags with count >= g are kept)
iterations = 50       # number of PageRank iterations
beta = 0.8            # 1 - teleport probability
n = 5                 # number of songs to recommend
In [11]:
def main():
    '''
    Main program: runs topic-specific PageRank on the given dataset with the parameters above
    '''
    files, filenames = get_all_files(basedir)
    M, from_, _ = get_stochastic_matrix(files, t)  # stochastic matrix and dict mapping track ids to row indices
    M = M.tocsr()  # convert from COO to CSR for fast matrix-vector products
    N, tags_ = get_songs_to_tags_matrix(files, from_, g)  # songs-to-tags matrix, reusing the track-id-to-row mapping
    N = N.tocsc()  # convert from COO to CSC for fast column slicing
    teleport_set = get_teleport_set(N, tags_, user_tags)  # teleport set derived from the user tags
    output = get_top_songs(compute_topic_specific_page_rank(M, teleport_set, iterations, beta), from_, n)
    titles = []
    for item in output:
        with open(os.path.join(filenames[item], item + '.json'), 'r') as f:
            song = json.load(f)
        titles.append(song['title'])
    return titles
In [12]:
main()
Out[12]: